# home *** CD-ROM | disk | FTP | other *** search
# Wrap
#!/usr/bin/perl
# -w   (warnings were left switched off by the original author)
#
# Builds the documentation set for the UBCD CD.
#
# For every utility listed in ubcd-docs.csv the script downloads the referenced
# documents with wget, converts PDFs to HTML with pdftotext, and writes four
# parallel indexes (plain HTML, HTML table, CSV, XML) plus a DTD and an XSL
# stylesheet for the XML index.  Afterwards the tree is tidied, packed and
# archived.
#
# NOTE: "use strict" is intentionally not enabled.  The record fields are
# package globals ($utility, $doc1, $doc1title, ...) that are read and written
# through symbolic references ($$doc_var, $$doc_title_var); strict refs/vars
# would reject that design.

use Data::Dumper;
use Tie::IxHash;
use XML::Checker;
use XML::Simple;
use XML::Writer;
use XML::Generator;
use XML::Parser;
use XML::LibXML;
use IO::Handle;
use IO;
use XML::SimpleObject;
use XML::TreeBuilder;
use XML::Element;
use Getopt::Std;

#XML::Parser::Grove - went ok
#XML::Grove - failed

# should use these
use XML::DOM;
use XML::LibXSLT;

# to do
#
# update freesco
#
# make html print more fields
# tighten code
# perlize for greater cross platform compatibility
# add routine to get size of images
# improve descriptions
# fix link building problem that affects 3 links

# start in base dir
chdir "/docs_uncompressed"
    or warn "Cannot chdir to /docs_uncompressed: $!\n";

# NOTE(review): the script stops right after these two validation passes, so
# the whole build pipeline below is currently unreachable.  This looks like a
# leftover from testing the validators; it is kept as found because the
# pipeline deletes /cmp and /docs.  Remove the `exit` to run a full build.
check_form();
$fh = "/docs/ubcd.xml";
check_valid();
exit;

# sort input file by utility name (done before the file is opened for reading)
$srt = `sort ubcd-docs.csv > ubcd-docs.csv.sorted`;
$srt = `cp -f ubcd-docs.csv.sorted ubcd-docs.csv`;
$srt = `rm -f ubcd-docs.csv.sorted`;

# input file
# The original re-opened this handle on "ubcd.xml" immediately afterwards,
# which discarded the CSV; the loop below splits comma separated lines, so
# the CSV is the input it can actually process.
open(UBCDDOCS, '<', 'ubcd-docs.csv')
    or die "Cannot read ubcd-docs.csv: $!\n";

# files to write
open(UBCDINDEX, '>', 'ubcd-index.html')
    or die "Cannot write ubcd-index.html: $!\n";
open(UBCDINDEXTABLE, '>', 'ubcd-index-table.html')
    or die "Cannot write ubcd-index-table.html: $!\n";
open(UBCDINDEXTXT, '>', 'ubcd-index.csv')
    or die "Cannot write ubcd-index.csv: $!\n";
open(UBCDINDEXXML, '>', 'ubcd-index.xml')
    or die "Cannot write ubcd-index.xml: $!\n";
open(UBCDINDEXXST, '>', 'ubcd-index.xsl')
    or die "Cannot write ubcd-index.xsl: $!\n";
open(UBCDINDEXDTD, '>', 'ubcd-index.dtd')
    or die "Cannot write ubcd-index.dtd: $!\n";

# Names of the package globals that hold one CSV record, in column order.
@data_vars = qw(
    utility
    doc1 doc1title doc2 doc2title doc3 doc3title doc4 doc4title
    doc5 doc5title doc6 doc6title doc7 doc7title
    webpage imagename description dosapp category menu
    maintainer lastupdate version size
);

# Column titles (not referenced anywhere else in this file).
%titles = (
    utility    => "Utility",
    doc1       => "Doc #1", doc1title => "Doc #1",
    doc2       => "Doc #1", doc2title => "Doc #1",
    doc3       => "Doc #1", doc3title => "Doc #1",
    doc4       => "Doc #1", doc4title => "Doc #1",
    doc5       => "Doc #1", doc5title => "Doc #1",
    doc6       => "Doc #1", doc6title => "Doc #1",
    doc7       => "Doc #1", doc7title => "Doc #1",
    webpage    => "Doc #1",
    imagename  => "Doc #1",
    dosapp     => "Doc #1",
    category   => "Doc #1",
    menu       => "Doc #1",
    maintainer => "Doc #1",
    lastupdate => "Doc #1",
    version    => "Doc #1",
);

# Fields that may hold a document URL.  The title of field X lives in the
# global named "Xtitle"; for "webpage" that is $webpagetitle below.
@doc_vars     = qw(doc1 doc2 doc3 doc4 doc5 doc6 doc7 webpage);
$webpagetitle = "Web Page";

printdtd();
printxsl();
printtitles();

print "Gathering Docs\n";
while (my $utility_info = <UBCDDOCS>) {
    chomp($utility_info);

    # Plain comma split: a quoted field that itself contains a comma will be
    # split in two (same limitation as the original).
    my @fields = split /,/, $utility_info;
    foreach my $i (0 .. $#data_vars) {
        ${ $data_vars[$i] } = $fields[$i];
    }

    # remove quotes
    foreach my $name (qw(utility description imagename dosapp category
                         menu maintainer lastupdate size version)) {
        $$name =~ s/"//g if defined $$name;
    }

    # Skip blank rows and the header row before doing any work for them.
    if (   !defined $imagename
        or $imagename !~ m/[a-zA-Z]/
        or $imagename =~ m/Utility/) {
        next;
    }

    # Images are stored either as <name>.igz or as <name>.img.
    $imagename_fixed = lc $imagename;
    $file_test       = "$imagename_fixed" . ".igz";
    print "looking for [$file_test]\n";
    if (-f "/mnt/disk/images/$file_test") {
        $imagename_fixed = "$imagename_fixed" . ".igz";
    }
    else {
        $imagename_fixed = "$imagename_fixed" . ".img";
    }

    # The size column of the CSV is replaced by the measured image size (KB).
    chomp($size = `du -k /mnt/disk/images/$imagename_fixed|cut -f1`);

    print "\n\n\nGetting docs for [$imagename|$dosapp]\n";

    # Everything for one utility is downloaded into its own directory.
    mkdir $imagename;
    unless (chdir "$imagename") {
        warn "Cannot chdir to [$imagename], skipping: $!\n";
        next;
    }

    print UBCDINDEX      qq!$utility $description !;
    print UBCDINDEXTXT   qq!$utility, $description, !;
    print UBCDINDEXTABLE qq!<tr><td>$utility</td><td>$description</td>!;
    printutilinfo();

    # $doc_var and $doc_title_var stay package globals: printdoc() reads them.
    foreach $doc_var (@doc_vars) {
        next unless defined $$doc_var and $$doc_var =~ m/[a-zA-Z]/;

        # remove quotes
        $$doc_var =~ s/"//g;

        # define other doc vars
        $doc_title_var = "$doc_var" . "title";

        # remove quotes
        $$doc_title_var =~ s/"//g if defined $$doc_title_var;

        print "Fetching [$$doc_var]\n";

        # List form: the URL is passed to wget as-is, without going through a
        # shell that could interpret characters inside it.
        system(
            'wget', '--continue', '--tries=1', '--html-extension',
            '--convert-links', '--page-requisites',
            '--user-agent=Mozilla/4.0 (compatable; MSIE 6.0; Windows NT 5.1)',
            $$doc_var
        );

        $url_orig = $$doc_var;
        my $is_pdf    = ($$doc_var =~ m/\.pdf$/i) ? 1 : 0;
        my $title_tag = '';

        # Turn the URL into the path of the downloaded copy, relative to
        # /docs_uncompressed: "http:/" is replaced by the utility directory.
        $$doc_var =~ s/http:\//$imagename/g;

        if ($is_pdf) {
            # convert PDFs to html
            print "PDF CONVERSION";
            $cvtoutput = `pdftotext -layout -htmlmeta -eol unix -nopgbrk "/docs_uncompressed/$$doc_var" 2>&1`;
            print " [$cvtoutput:/docs_uncompressed/$$doc_var]\n";
            unlink "/docs_uncompressed/$$doc_var";

            # change converted PDF extension to html
            $$doc_var =~ s/\.pdf$/.html/i;
            $title_tag = ' [html]';
        }
        else {
            # if the document is a html type document as defined by wget
            # ensure it ends in .html
            if (    $$doc_var !~ m/\.(?:aspx|htm|html|faq|FAQ|lsm|txt|doc)$/
                and $$doc_var !~ m/\/$/) {
                $$doc_var = $$doc_var . ".html";
            }
        }

        # fix document name & location to be loadable via a browser.
        # Absolute paths are used, so the working directory must stay inside
        # the utility directory for the remaining downloads.
        $tmp_doc_var = $$doc_var;
        $$doc_var =~ tr/?=&+//d;
        if ($$doc_var ne $tmp_doc_var) {
            my $status =
                rename("/docs_uncompressed/$tmp_doc_var",
                       "/docs_uncompressed/$$doc_var")
                ? 'ok'
                : $!;
            print "File moved [/docs_uncompressed/$tmp_doc_var|/docs_uncompressed/$$doc_var][$status]\n";
        }

        # write indexes
        print UBCDINDEX      qq!<A HREF="$$doc_var">$$doc_title_var$title_tag</A> !;
        print UBCDINDEXTXT   qq!$$doc_title_var, $$doc_var, !;
        print UBCDINDEXTABLE qq!<td><a href="$$doc_var">$$doc_title_var</a></td>!;
        printdoc();
    }

    print UBCDINDEX      qq!<BR>\n!;
    print UBCDINDEXTXT   qq!\n!;
    print UBCDINDEXTABLE qq!</tr>\n!;
    print UBCDINDEXXML   qq!</utility_info>\n\n!;
    chdir "/docs_uncompressed";
}

print UBCDINDEX      qq!</body></html>\n!;
print UBCDINDEXTXT   qq!End\n!;
print UBCDINDEXTABLE qq!</table></body></html>\n!;
print UBCDINDEXXML   qq!</catalog>\n!;

# Close everything now so the indexes are completely on disk before the tree
# is tidied, packed and archived.
foreach my $handle (\*UBCDINDEX, \*UBCDINDEXTABLE, \*UBCDINDEXTXT,
                    \*UBCDINDEXXML, \*UBCDINDEXXST, \*UBCDINDEXDTD,
                    \*UBCDDOCS) {
    close($handle) or warn "Error closing a file: $!\n";
}

chomp($check = `du -bs /docs_uncompressed`);
print "\n\nDocs RAW Size [$check]\n\n";

print "Running html tidy on html files\n";
# The name tests are grouped so that -print/-exec apply to both patterns, and
# the patterns are quoted so the shell does not expand them.
system(q!find . \( -name '*.html' -o -name '*.htm' \) -print -exec tidy -modify -upper -quiet -omit -errors {} \; > /dev/null 2>&1!);
chomp($check = `du -bs /docs_uncompressed`);
print "Docs after HTML Tidy Size [$check]\n\n";

print "Compressing docs_uncompressed to /cmp\n";
$rm_old      = `rm -rf /cmp /docs`;
$pack_result = `webpack -b /cmp/`;
chomp($check = `du -bs /cmp`);
print "/cmp compressed Size [$check]\n\n";

print "Moving /cmp to /docs\n";
$move = `mv -f /cmp /docs`;

# Validate XML
check_form();

print "Archiving docs\n";
$tgz = `tar -czf /docs.tar.gz /docs`;
chomp($tgz_size = `du -bs /docs.tar.gz`);
print "Archive size [$tgz_size]\n\n";

print "Done\n\n";
exit;

# Return a copy of the given text with the five XML special characters
# replaced by their predefined entities.  "&" is handled first so that the
# ampersands introduced by the other replacements are not escaped again.
# An undefined argument yields the empty string.
sub xml_escape {
    my ($text) = @_;
    return '' unless defined $text;
    $text =~ s/&/&amp;/g;
    $text =~ s/</&lt;/g;
    $text =~ s/>/&gt;/g;
    $text =~ s/"/&quot;/g;
    $text =~ s/'/&apos;/g;
    return $text;
}

# For every record field X named in @data_vars, store an XML-escaped copy of
# the global $X in the global $X_enc.  The raw values are left untouched.
sub encode_xml_data {
    foreach my $name (@data_vars) {
        my $enc_name = $name . "_enc";
        $$enc_name = xml_escape($$name);
    }
}

# Write the fixed opening part of the HTML, HTML-table, CSV and XML indexes.
sub printtitles {
    # print document titles
    print UBCDINDEX <<'EOF';
<html><head><title>UBCD CD Based Docs - HTML</title></head><body>
<BR>UBCD CD based docs index types: <a href="ubcd-index.html">HTML</a> <a href="ubcd-index-table.html">HTML Table</a> <a href="ubcd.xml">XML</a> <a href="ubcd-index.csv">CSV</a><BR><BR>
UTILITY DESCRIPTION DOCUMENTS<BR>
EOF

    print UBCDINDEXTABLE <<'EOF';
<html><head><title>UBCD CD Based Docs - HTML Table</title></head><body>
<BR>UBCD CD based docs index types: <a href="ubcd-index.html">HTML</a> <a href="ubcd-index-table.html">HTML Table</a> <a href="ubcd.xml">XML</a> <a href="ubcd-index.csv">CSV</a><BR><BR>
<table border="1">
<TR><TD>UTILITY</TD><TD>DESCRIPTION</TD><TD>DOCUMENTS</TD></TR>
EOF

    print UBCDINDEXTXT qq!UBCD CD Based Docs - Text Listing - CSV\n!;
    print UBCDINDEXTXT qq!UTILITY,DESCRIPTION,DOCUMENT,DOCUMENT,DOCUMENT,DOCUMENT,DOCUMENT,DOCUMENT,DOCUMENT,WEBPAGE,\n!;

    # NOTE(review): the XML refers to ubcd.xsl / ubcd.dtd while this script
    # writes ubcd-index.xsl / ubcd-index.dtd; presumably the files are renamed
    # elsewhere - confirm.
    print UBCDINDEXXML <<'EOF';
<?xml version="1.0"?>
<?xml-stylesheet type="text/xsl" href="ubcd.xsl"?>
<!DOCTYPE catalog SYSTEM "ubcd.dtd">
<catalog>
EOF
}

# Open a <utility_info> element in the XML index and write the escaped scalar
# fields of the current record.  The caller closes the element after the
# record's <doc> children have been written.
sub printutilinfo {
    encode_xml_data();
    print UBCDINDEXXML <<EOF;
<utility_info>
<utility>$utility_enc</utility>
<description>$description_enc</description>
<imagename>$imagename_enc</imagename>
<dosapp>$dosapp_enc</dosapp>
<category>$category_enc</category>
<menu>$menu_enc</menu>
<maintainer>$maintainer_enc</maintainer>
<lastupdate>$lastupdate_enc</lastupdate>
<size>$size_enc</size>
<version>$version_enc</version>
EOF
}

# Write one <doc> element to the XML index for the document currently being
# processed.  Reads the globals $doc_title_var and $doc_var (names of the
# globals holding title and local path) and $url_orig (the original URL).
# Escaping is done on local copies, so the globals are not modified.
sub printdoc {
    my $title    = xml_escape($$doc_title_var);
    my $location = xml_escape($$doc_var);
    my $url      = xml_escape($url_orig);
    print UBCDINDEXXML <<EOF;
<doc>
<title>$title</title>
<location>$location</location>
<url>$url</url>
</doc>
EOF
}

# Write the DTD describing the XML index.
sub printdtd {
    # ubcd.dtd
    print UBCDINDEXDTD <<'EOF';
<!ELEMENT catalog (utility_info*)>
<!ELEMENT utility_info (utility, description, imagename, dosapp?, category, menu, maintainer, lastupdate, size?, version, doc*)>
<!ELEMENT utility (#PCDATA)>
<!ELEMENT description (#PCDATA)>
<!ELEMENT imagename (#PCDATA)>
<!ELEMENT dosapp (#PCDATA)>
<!ELEMENT category (#PCDATA)>
<!ELEMENT menu (#PCDATA)>
<!ELEMENT maintainer (#PCDATA)>
<!ELEMENT lastupdate (#PCDATA)>
<!ELEMENT size (#PCDATA)>
<!ELEMENT version (#PCDATA)>
<!ELEMENT doc (title, location, url)>
<!ELEMENT title (#PCDATA)>
<!ELEMENT location (#PCDATA)>
<!ELEMENT url (#PCDATA)>
EOF
    # end ubcd.dtd
}

# Write the XSL stylesheet that renders the XML index as HTML.
sub printxsl {
    # ubcd.xsl
    print UBCDINDEXXST <<'EOF';
<?xml version="1.0"?>
<xsl:stylesheet version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
<xsl:output method="html" encoding="UTF-8"/>

<xsl:template match="/">
<html><head><title>UBCD CD Based Docs - XML</title></head>
<body>
<p><b>UBCD CD based documentation</b></p>
<ol>
<xsl:apply-templates mode="TOC"/>
</ol>
<xsl:apply-templates mode="body"/>
</body>
</html>
</xsl:template>

<xsl:template match="utility_info" mode="TOC">
<li><a href="{concat('#utility', position())}"><xsl:value-of select="utility/text()"/></a></li>
</xsl:template>

<xsl:template match="utility_info" mode="body">
<p><a name="{concat('utility', position())}"><xsl:value-of select="text()"/></a></p>
<xsl:apply-templates select="utility"/>
<xsl:text> </xsl:text>
<xsl:apply-templates select="version"/>
<br></br>
<xsl:apply-templates select="description"/>
<br></br>
<xsl:apply-templates select="lastupdate"/>
<xsl:apply-templates select="doc"/>
</xsl:template>

<xsl:template match="utility"><b><xsl:value-of select="text()"/></b></xsl:template>

<xsl:template match="description"><xsl:value-of select="text()"/></xsl:template>

<!-- this is a way to handle docs. please uncomment this and comment the doc template under it
<xsl:template match="doc">
<br></br><a href="{url}"><xsl:value-of select="title"/></a>
</xsl:template>
-->

<xsl:template match="doc">
<br></br>
<xsl:text>[documentation] </xsl:text>
<xsl:value-of select="title"/>
<xsl:text>: </xsl:text>
<a href="{url}">web</a>
<xsl:text> </xsl:text>
<a href="{location}">local</a>
</xsl:template>

<xsl:template match="version">Version: <xsl:value-of select="text()"/></xsl:template>

<xsl:template match="lastupdate">Last Updated: <xsl:value-of select="text()"/></xsl:template>

</xsl:stylesheet>
EOF
    # end ubcd.xsl
}

# Check that an XML file is well-formed and report the result on STDERR.
# Takes an optional file name; defaults to /docs/ubcd.xml.
sub check_form {
    my $xmlfile = @_ ? shift : "/docs/ubcd.xml";    # the file to parse

    # initialize parser object and parse the file
    my $parser = XML::Parser->new( ErrorContext => 2 );
    eval { $parser->parsefile( $xmlfile ); };

    # report any error that stopped parsing, or announce success
    if( $@ ) {
        $@ =~ s/at \/.*?$//s;    # remove module line number
        print STDERR "Validating XML (Form) [ERROR in '$xmlfile']:\n$@\n";
    }
    else {
        print STDERR "Validating XML (Form) ['$xmlfile' is well-formed]\n";
    }
}

# Parse the file named by the first command line argument into XML::Parser's
# 'Tree' structure and dump that structure to STDOUT.
sub dump_tree {
    # initialize parser and read the file
    my $parser = XML::Parser->new( Style => 'Tree' );
    my $tree   = $parser->parsefile( shift @ARGV );

    # serialize the structure
    print Dumper( $tree );
}

# Check that an XML file is valid against its DTD and print the verdict.
# Takes an optional file name; defaults to the path stored in the global $fh.
# The file itself is parsed (the original announced the file name but read the
# document from STDIN); a document that cannot be parsed counts as invalid.
sub check_valid {
    my $xmlfile = @_ ? shift : $fh;
    $xmlfile = '' unless defined $xmlfile;
    print "Validating XML (Valid) ['$xmlfile' is ";

    # initialize the parser
    my $parser = XML::LibXML->new;

    # parse_file dies on malformed input, hence the eval
    my $doc = eval { $parser->parse_file( $xmlfile ) };
    if( $doc and $doc->is_valid ) {
        print "valid]\n";
    }
    else {
        print "invalid]\n";
    }
}

# ---- test code ----
#
# The three handlers below call output(), which is not defined in this file,
# and nothing in this file calls them.

#
# handle xml declaration
#
sub xml_decl {
    my( $self, $properties ) = @_;
    output( "<?xml version=\"" . $properties->{'Version'} . "\"" );
    my $encoding = $properties->{'Encoding'};
    output( " encoding=\"$encoding\"" ) if( $encoding );
    my $standalone = $properties->{'Standalone'};
    output( " standalone=\"$standalone\"" ) if( $standalone );
    output( "?>\n" );
}

#
# handle doctype declaration:
# try to duplicate the original
#
sub doctype_decl {
    my( $self, $properties ) = @_;
    output( "\n<!DOCTYPE " . $properties->{'Name'} . "\n" );
    my $pubid = $properties->{'PublicId'};
    if( $pubid ) {
        output( " PUBLIC \"$pubid\"\n" );
        output( " \"" . $properties->{'SystemId'} . "\"\n" );
    }
    else {
        output( " SYSTEM \"" . $properties->{'SystemId'} . "\"\n" );
    }
    my $intset = $properties->{'Internal'};
    if( $intset ) {
        $in_intset = 1;    # package global: marks that an internal subset is open
        output( "[\n" );
    }
    else {
        output( ">\n" );
    }
}

#
# handle entity declaration in internal subset:
# recreate the original declaration as it was
#
sub entity_decl {
    my( $self, $properties ) = @_;
    my $name = $properties->{'Name'};
    output( "<!ENTITY $name " );
    my $pubid = $properties->{'PublicId'};
    my $sysid = $properties->{'SystemId'};
    if( $pubid ) {
        output( "PUBLIC \"$pubid\" \"$sysid\"" );
    }
    elsif( $sysid ) {
        output( "SYSTEM \"$sysid\"" );
    }
    else {
        output( "\"" . $properties->{'Value'} . "\"" );
    }
    output( ">\n" );
}

# ---- next segment ----

# Write a small sample HTML document to output.xml with XML::Writer.
sub test_gen_xml {
    my $output = IO::File->new( ">output.xml" )
        or die "Cannot write output.xml: $!\n";
    my $writer = XML::Writer->new( OUTPUT => $output );

    $writer->xmlDecl( 'UTF-8' );
    $writer->doctype( 'html' );
    $writer->comment( 'My happy little HTML page' );
    $writer->pi( 'foo', 'bar' );
    $writer->startTag( 'html' );
    $writer->startTag( 'body' );
    $writer->startTag( 'h1' );
    $writer->startTag( 'font', 'color' => 'green' );
    $writer->characters( "<Hello World!>" );
    $writer->endTag( );    # font
    $writer->endTag( );    # h1
    $writer->dataElement( "p", "Nice to see you." );
    $writer->endTag( );    # body
    $writer->endTag( );    # html
    $writer->end( );
    $output->close;
}